#!/bin/bash
# Put CUDA_VISIBLE_DEVICES=0 into the environment so that every child process
# started by this script inherits it.
# NOTE(review): presumably this pins GPU work to device 0 -- confirm that the
# Python scripts invoked below actually use CUDA.
CUDA_VISIBLE_DEVICES=0
export CUDA_VISIBLE_DEVICES

#######################################
# Run paragraph extraction over the training split.
#
# Feeds each of <source_dir>/trainset/{search,zhidao}.train.json to
# `python paragraph_extraction_fake_only.py train` on stdin and stores the
# command's stdout in <target_dir>/trainset/{search,zhidao}-fake.train.json.
# The test and dev splits are currently disabled (commented out below).
#
# Arguments:
#   $1 - source directory; its trainset/ subdirectory holds the input files
#   $2 - target directory; trainset/ is created underneath it if missing
# Outputs:
#   Progress messages on stdout, error messages on stderr.
# Returns:
#   0 if both files were processed successfully,
#   1 if the output directory could not be created or any file failed,
#   2 if called with fewer than two arguments.
#######################################
paragraph_extraction ()
{
    if [ "$#" -lt 2 ]; then
        echo "usage: paragraph_extraction SOURCE_DIR TARGET_DIR" >&2
        return 2
    fi

    # Function-local so the caller's namespace is not polluted.
    local source_dir=$1
    local target_dir=$2
    local name
    local rc=0

    echo "Start paragraph extraction, this may take a few hours"
    echo "Source dir: $source_dir"
    echo "Target dir: $target_dir"

#    echo "Processing testset"
#    python paragraph_extraction.py test \
#            < "$source_dir/testset/search.test.json" \
#            > "$target_dir/testset/search.test.json"
#    python paragraph_extraction.py test \
#            < "$source_dir/testset/zhidao.test.json" \
#            > "$target_dir/testset/zhidao.test.json"
#
    echo "Processing trainset"
    # The output redirection below fails if the directory does not exist yet.
    if ! mkdir -p -- "$target_dir/trainset"; then
        echo "paragraph_extraction: cannot create $target_dir/trainset" >&2
        return 1
    fi

    for name in search zhidao; do
        # Input is redirected before output: when the input file is missing the
        # command is not run and an existing output file is left untouched.
        if ! python paragraph_extraction_fake_only.py train \
                < "$source_dir/trainset/$name.train.json" \
                > "$target_dir/trainset/$name-fake.train.json"; then
            echo "paragraph_extraction: failed to process $name.train.json" >&2
            # Keep going so the remaining file is still processed, but make
            # sure the failure is reflected in the return status.
            rc=1
        fi
    done

#    NOTE(review): the disabled devset commands call paragraph_extraction_fake.py,
#    not paragraph_extraction_fake_only.py -- confirm which one is intended
#    before re-enabling them.
#    echo "Processing devset"
#    python paragraph_extraction_fake.py dev \
#            < "$source_dir/devset/search.dev.json" \
#            > "$target_dir/devset/search-fake.dev.json"
#    python paragraph_extraction_fake.py dev \
#            < "$source_dir/devset/zhidao.dev.json" \
#            > "$target_dir/devset/zhidao-fake.dev.json"
#
#    echo "Paragraph extraction done!"

    return "$rc"
}


# Script entry point.
# All paths below are relative, so they are resolved against the current
# working directory at the time the script is run.

# First positional argument. It is not referenced anywhere else in the visible
# part of this script.
PROCESS_NAME="$1"

# Bail out early when the data directory is missing; the diagnostic goes to
# stderr so it is not mixed into regular output.
if [[ ! -d ../../data ]]; then
    echo "Please download the preprocessed data first (See README - Preprocess)" >&2
    exit 1
fi

# Read from ../../data/ and write the extracted output to ../../data/extracted.
paragraph_extraction ../../data/ ../../data/extracted
